Filter criteria:
Filter differentially expressed genes between autism and control (p-value < 0.05)
No samples are removed based on network connectivity z-scores
Remove first Principal Component
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)
First principal component explains over 89.7% of the total variance. Removing first component and keeping the next 10 (because original analysis was made with first 11).
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
datExpr = data.frame(datExpr)
datExpr_pca = prcomp(datExpr, scale.=TRUE)
last_pc = 11
par(mfrow=c(1,2))
plot(summary(datExpr_pca)$importance[2,-1], type='b')
abline(v=last_pc, col='blue')
plot(summary(datExpr_pca)$importance[3,-1]-summary(datExpr_pca)$importance[3,1], type='b')
abline(h=summary(datExpr_pca)$importance[3,last_pc]-summary(datExpr_pca)$importance[3,1], col='blue')
print(glue('Removing first PC and keeping next 10 components, explaining ',
(summary(datExpr_pca)$importance[3,last_pc]-summary(datExpr_pca)$importance[3,1])*100,
'% of the variance'))
datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:glue('PC',last_pc))
return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Removing first PC and keeping next 10 components, explaining 8.367% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output
new_colnames = colnames(datExpr_redDim)[-ncol(datExpr_redDim)]
datExpr_redDim = datExpr_redDim %>% dplyr::select(-PC1)
colnames(datExpr_redDim) = new_colnames
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr, new_colnames)
Two clouds of points?
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')
# Remove Subject with ID = 'AN03345'
keep = datMeta$Subject_ID!='AN03345'
datMeta = datMeta[keep,]
datExpr = datExpr[,keep]
# Calculate differential expression p-value of each gene
mod = model.matrix(~ Dx, data=datMeta)
corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr))
ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
ASD_pvals = ordered_top_genes$adj.P.Val
log_fold_change = ordered_top_genes$logFC
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC2, y=PC3, color=ASD_pvals)) + geom_point(alpha=0.5) + theme_minimal()
p2 = pca_data_projection %>% ggplot(aes(x=PC2, y=PC3, color=log_fold_change)) + geom_point() + theme_minimal() +
scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
rm(mod, corfit, lmfit, fit, ASD_pvals, log_fold_change, top_genes, ordered_top_genes, datExpr,
datProbes, datSeq, p1, p2, keep)
clusterings = list()
set.seed(123)
wss = sapply(1:15, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=6 as best number of clusters.
Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.
Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples). The yellow cluster is made of young ASD samples.
h_clusts = datExpr_redDim %>% dist %>% hclust
plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 4
clusterings[['hc']] = cutree(h_clusts, best_k)
Samples are grouped into two big clusters, one small, one of only six genes and two outliers, and two outliers, the first big cluster has three main subclusters, two small subclusters and two outliers, and the second one two main subclustersand five small subclusters.
*Output plots in clustering_genes_03_20 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
Leaves most of the observations (~80%) without a cluster (same result as when considering the first PC):
ICA_clusters %>% rowSums %>% table
## .
## 0 1 2 3 4 5
## 2350 420 117 32 8 1
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) +
geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
best_power=21
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 30, by=1))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.7890 1.7600 0.818 1360.00 1450.00 1880.0
## 2 2 0.3970 0.4290 0.486 817.00 871.00 1390.0
## 3 3 0.0295 -0.0815 0.133 549.00 571.00 1090.0
## 4 4 0.3490 -0.3280 0.520 394.00 397.00 885.0
## 5 5 0.5340 -0.5000 0.660 295.00 285.00 735.0
## 6 6 0.6380 -0.6140 0.782 228.00 211.00 622.0
## 7 7 0.6630 -0.7190 0.810 181.00 161.00 534.0
## 8 8 0.7110 -0.7890 0.849 146.00 123.00 462.0
## 9 9 0.7450 -0.8490 0.893 120.00 97.00 404.0
## 10 10 0.7620 -0.8940 0.915 99.60 77.40 356.0
## 11 11 0.7750 -0.9340 0.927 83.80 62.40 316.0
## 12 12 0.7880 -0.9730 0.941 71.20 51.30 282.0
## 13 13 0.8050 -1.0000 0.950 61.00 42.40 253.0
## 14 14 0.8160 -1.0300 0.960 52.70 35.10 228.0
## 15 15 0.8240 -1.0600 0.964 45.90 29.80 207.0
## 16 16 0.8260 -1.1000 0.965 40.10 25.30 188.0
## 17 17 0.8300 -1.1300 0.966 35.30 21.70 172.0
## 18 18 0.8420 -1.1500 0.973 31.30 18.50 157.0
## 19 19 0.8390 -1.1800 0.970 27.80 16.00 144.0
## 20 20 0.8480 -1.1900 0.974 24.80 13.90 133.0
## 21 21 0.8580 -1.2100 0.979 22.20 12.20 123.0
## 22 22 0.8600 -1.2300 0.979 20.00 10.60 114.0
## 23 23 0.8650 -1.2400 0.981 18.00 9.35 105.0
## 24 24 0.8690 -1.2500 0.983 16.30 8.30 97.8
## 25 25 0.8730 -1.2600 0.987 14.80 7.38 91.1
## 26 26 0.8800 -1.2700 0.989 13.50 6.61 84.9
## 27 27 0.8810 -1.2800 0.990 12.30 5.91 79.4
## 28 28 0.8840 -1.2900 0.990 11.30 5.29 74.3
## 29 29 0.8840 -1.3000 0.990 10.40 4.75 69.6
## 30 30 0.8880 -1.3100 0.991 9.52 4.28 65.3
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It leaves 484 genes without a cluster (even more than with the first PC)
clusterings[['WGCNA']] %>% table
## .
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 484 423 384 337 329 179 147 132 120 64 55 54 46 36 32 32 30 24
## 18
## 20
Number of clusters that resemble more Gaussian mixtures = 32 but 25 clusters gets a very similar BIC value
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 25
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Trying with 9 clusters
best_k = 9
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
## 1 2 3 4 5 6 7 8 9
## 305 239 220 345 295 281 350 475 418
Trying with 2 clusters to see if it can separate the two clouds of points (it can’t)
best_k = 2
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_3']] = gmm$Log_likelihood %>% apply(1, which.max)
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM_3']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.
manual_clusters = as.factor(as.numeric(1.5*datExpr_redDim$PC1 + 0.5 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() +
geom_abline(slope=1.5, intercept=0.5, color='gray') + theme_minimal()
names(manual_clusters) = rownames(datExpr_redDim)
clusterings[['Manual']] = manual_clusters
Cluster 2 has a slightly bigger mean expression than cluster 1 and a smaller standard deviation.
manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd),
manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) +
geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data, c2_sd, gmm_c2_sd, p1, p2,
pca_data_projection)
Using Adjusted Rand Index:
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
cluster1 = as.factor(clusterings[[i]])
for(j in (i):length(clusterings)){
cluster2 = as.factor(clusterings[[j]])
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
create_2D_plot = function(cat_var){
ggplotly(plot_points %>% ggplot(aes_string(x='PC1', y='PC2', color=cat_var)) +
geom_point() + theme_minimal() +
xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>%
layout(title = glue('Samples coloured by ', cat_var),
scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))
}
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), km_clust = as.factor(clusterings[['km']]),
hc_clust = as.factor(clusterings[['hc']]), cc_l1_clust = as.factor(clusterings[['cc_l1']]),
cc_clust = as.factor(clusterings[['cc_l2']]), ica_clust = as.factor(clusterings[['ICA_min']]),
n_ica_clust = as.factor(rowSums(ICA_clusters)), gmm_clust = as.factor(clusterings[['GMM']]),
gmm_clust2 = as.factor(clusterings[['GMM_2']]), gmm_clust3 = as.factor(clusterings[['GMM_3']]),
wgcna_clust = as.factor(clusterings[['WGCNA']]), manual_clust=as.factor(clusterings[['Manual']]))
Simple methods seem to only partition the space in buckets using information from the first component
All clusterings except K-means manage to separate the two clouds at least partially, but no one does a good job
WGCNA creates clusters inverted between clouds
Manual classification + GMM by standard deviation clusters make sense
create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('gmm_clust2')
create_2D_plot('gmm_clust3')
create_2D_plot('wgcna_clust')
create_3D_plot('ica_clust')
create_3D_plot('gmm_clust')
create_3D_plot('gmm_clust3')
create_3D_plot('wgcna_clust')